1 Set up

library(here)
# /*===== R =====*/
source(here("GitControlled/Codes/0_1_ls_packages.R"))
source(here("GitControlled/Codes/0_0_functions.R"))
# /*===== python  =====*/
# --- Python module --- #
source_python(here("GitControlled/Codes/0_0_import_modules.py"))
# --- CF-DML --- #
source_python(here("GitControlled/Codes/run_cf_dml.py"))
# --- DR-ORF --- #
source_python(here("GitControlled/Codes/run_dr_orf.py"))
# --- DML-ORF --- #
source_python(here("GitControlled/Codes/run_dml_orf.py"))

2 Preparation

2.1 Data

# /*===== Data (allocation period 3, 2008-2012 )=====*/
# --- panel data--- #
reg_data <- readRDS(here("Shared/Data/WaterAnalysis/comp_reg_dt.rds")) %>%
  .[year %in% 2008:2012 & usage <= 40,]

# library(caret)
# set.seed(3456)
# trainIndex <- createDataPartition(reg_data$treat2, p = .5,
#                                   list = FALSE,
#                                   times = 1)
# sample1 <- reg_data[trainIndex,]
# sample1[,.N, by="treat2"]

2.2 Functions

# /*===========================================*/
#'=  Function for Data Preparation=
# /*===========================================*/
prep_data <- 
  function(data, het_vars){
  # data = sample1; het_vars = c("pr_in", "gdd_in","pet_in")
  # data = sample1; het_vars = "pr_in"

  # /*===== variables =====*/
  cov_ls <- c(
  # --- weather --- #
  "pr_in", "gdd_in","pet_in",
  # --- soil --- #
  "silttotal_r", "claytotal_r", "slope_r", "ksat_r", "awc_r"
  )

  #/*--------------------------------*/
  #' ## Training data
  #/*--------------------------------*/

  # --- dependent variable --- #
  Y <- data[, usage] %>% as.array() %>% unname()
  # --- treatment indicator --- #
  T <- data[, treat2] %>% as.array() %>% unname()
  # --- heterogeneous impact dirivers --- #
  X <- data[, ..het_vars] %>% as.matrix() %>% unname()
  # --- confounders/controls --- #
  W <- data[, ..cov_ls] %>% as.matrix() %>% unname()

  #/*--------------------------------*/
  #' ## Testing data
  #/*--------------------------------*/
  X_test <- 
    lapply(
      het_vars,
      function(x) gen_pred_data(data, het_vars, target_var = x)
      ) %>%
    bind_rows()
    # .[,!"target_var"] %>%
    # as.matrix() %>%
    # unname()

  return(
    list(Y = Y, X = X, T = T, W = W, X_test = X_test)
    )
}



# /*===========================================*/
#'=  Function to run EconML =
# /*===========================================*/

#/*--------------------------------*/
#' ## DR-ORF
#/*--------------------------------*/
run_DR_orf_comp <- function(data, het_vars) {
  # data=sample1; het_vars= "pet_in"
  # data=sample1; het_vars= c("pr_in", "gdd_in","pet_in")

  dt <- prep_data(data, het_vars)
  Y <- dt$Y
  T <- dt$T
  X <- dt$X
  W <- dt$W
  X_test <- 
    dt$X_test %>%
    .[,!"target_var"] %>%
    as.matrix() %>%
    unname()


  res_dr_orf <- 
    run_dr_orf(
      Y = Y,
      T = T,
      X = X,
      W = W,
      X_test = X_test
    )

  temp <-
    dt$X_test %>%
    melt(id.var='target_var') %>%
     .[target_var==variable,]

  res_X_test_drorf <- 
    data.table(
      x = temp[, value],
      pred_te = res_dr_orf[[1]] %>% c(),
      te_lower = res_dr_orf[[2]] %>% c(),
      te_upper = res_dr_orf[[3]] %>% c(),
      variable = dt$X_test[, target_var]
    )

  # ggplot(res_X_test_drorf) +
  #   geom_line(aes(x=x, y=pred_te)) +
  #   # geom_smooth(aes(x=x, y=pred_te), se = FALSE) +
  #   geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4) +
  #   facet_grid(~variable, scale = "free")

  return(res_X_test_drorf)
}


#/*--------------------------------*/
#' ## DML-ORF
#/*--------------------------------*/
run_DML_orf_comp <- function(data, het_vars) {
  # data=sample1; het_vars= "pet_in"
  # data=sample1; het_vars= c("pr_in", "gdd_in","pet_in")
  dt <- prep_data(data, het_vars)
  Y <- dt$Y
  T <- dt$T
  X <- dt$X
  W <- dt$W
  X_test <- 
    dt$X_test %>%
    .[,!"target_var"] %>%
    as.matrix() %>%
    unname()

  #/*--------------------------------*/
  #' ## Regression Analysis
  #/*--------------------------------*/
  res_dml_orf <- 
    run_dml_orf(
      Y = Y,
      T = T,
      X = X,
      W = W,
      X_test = X_test
    )

  temp <-
    dt$X_test %>%
    melt(id.var='target_var') %>%
     .[target_var==variable,]

  res_X_test_dmlorf <- 
    data.table(
      x = temp[, value],
      pred_te = res_dml_orf[[1]] %>% c(),
      te_lower = res_dml_orf[[2]] %>% c(),
      te_upper = res_dml_orf[[3]] %>% c(),
      variable = dt$X_test[, target_var]
    )

  # ggplot(res_X_test_dmlorf) +
  #   geom_line(aes(x=x, y=pred_te)) +
  #   # geom_smooth(aes(x=x, y=pred_te), se = FALSE) +
  #   geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4)

  return(res_X_test_dmlorf)
}
  • For DR-ORF
    • n_trees=1000
    • min_leaf_size=10
    • max_depth = 50
    • subsample_ratio = 0.7
    • propensity_model = LogisticRegression(C=1, penalty='l1', solver='saga', multi_class='auto')
    • model_Y=LassoCV(cv=3)
    • propensity_model_final=LogisticRegressionCV(cv=3, penalty='l1', solver='saga', multi_class='auto')
    • model_Y_final=WeightedLassoCV(cv=3)
  • For DML-ORF
    • n_trees=1000
    • min_leaf_size=10
    • max_depth=50
    • subsample_ratio = 0.7
    • model_T=LogisticRegression(C=1, solver='saga', multi_class='auto')
    • model_Y=Lasso(alpha=0.01)
    • model_T_final=LogisticRegressionCV(cv=3, solver='saga', multi_class='auto')
    • model_Y_final=WeightedLassoCV(cv=3)
    • discrete_treatment=True

For DML-ORF, if discrete_treatment=True (if treatment is is binary), model_T (The estimator for fitting the treatment to the features) is treated as a classifier that gives the probabilities for the treatment (such as LogisticRegression, RandomForestClassifier, GradientBoostingClassifier).


3 Regression Analysis:

3.1 Estimate the Imapct of pet_in, gdd_in, and pr_in

  • Build a model with:
    • W: “pr_in”, “gdd_in”, “pet_in”, “silttotal_r”, “claytotal_r”, “slope_r”, “ksat_r”, “awc_r”
    • X: “pr_in”, “gdd_in”,“pet_in”
  • Prediction phase
    • Pick one variable (e.g., “pr_in”). Holding other variables (e.g., “gdd_in”, “pet_in”) constant (mean value of the training data), see how the treatment effect varies with the variable.
    • Repeat this for “pr_in”, “gdd_in”,“pet_in”
# === Run DR-ORF === #
res_DR_orf_joint <- 
  run_DR_orf_comp(data=reg_data, het_vars = c("pr_in", "gdd_in","pet_in"))

saveRDS(res_DR_orf_joint, here("Shared/Results/resRegression/res_DR_orf_joint.rds"))

ggplot(res_DR_orf_joint) +
  geom_line(aes(x=x, y=pred_te)) +
  geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4) +
  facet_grid(~variable, scale = "free")



# === Run DML-ORF === #
res_DML_orf_joint <- 
  run_DML_orf_comp(data=reg_data, het_vars = c("pr_in", "gdd_in","pet_in"))

saveRDS(res_DML_orf_joint, here("Shared/Results/resRegression/res_DML_orf_joint.rds"))


ggplot(res_DML_orf_joint) +
  geom_line(aes(x=x, y=pred_te)) +
  geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4) +
  facet_grid(~variable, scale = "free")

3.2 Results

res_DR_orf_joint <- 
  here("Shared/Results/resRegression/res_DR_orf_joint.rds") %>%
  readRDS()

res_DML_orf_joint <-
  here("Shared/Results/resRegression/res_DML_orf_joint.rds") %>%
  readRDS()

res_joint <- 
  bind_rows(res_DR_orf_joint, res_DML_orf_joint, .id = "type") %>%
  .[,type:=ifelse(type==1, "DR-ORF", "DML-ORF")]

ggplot(res_joint) +
  geom_line(aes(x=x, y=pred_te)) +
  geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4) +
  facet_grid(type~variable, scale = "free_x")+
  labs(title = "Estimate the Impact of pet_in, gdd_in, and pr_in, separately") +
  ylab("Treatment Effect (inches)")


3.3 Estimate the Impact of pet_in, gdd_in, and pr_in, separately

  • For each one of the variables “pr_in”, “gdd_in” and “pet_in”:
    • build model with
      • W: “pr_in”, “gdd_in”, “pet_in”, “silttotal_r”, “claytotal_r”, “slope_r”, “ksat_r”, “awc_r”
      • X: a weather variable under consideration (e.g, only “pr_in”)
    • Prediction phase
      • see how the treatment effect varies with the variable.
  • Repeat this process over “pr_in”, “gdd_in” and “pet_in”
# === Run DR-ORF === #
res_DR_orf_sep <- 
  lapply(
    c("pr_in", "gdd_in","pet_in"), 
    function(x) run_DR_orf_comp(data=reg_data, het_vars = x)) %>%
  bind_rows()

saveRDS(res_DR_orf_sep, here("Shared/Results/resRegression/res_DR_orf_sep.rds"))

# ggplot(res_DR_orf_sep) +
#   geom_line(aes(x=x, y=pred_te)) +
#   geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4) +
#   facet_grid(~variable, scale = "free")

# === Run DML-ORF === #
res_DML_orf_sep <- 
  lapply(c("pr_in", "gdd_in","pet_in"), 
  function(x) run_DML_orf_comp(data=reg_data, het_vars = x)) %>%
  bind_rows()

saveRDS(res_DML_orf_sep, here("Shared/Results/resRegression/res_DML_orf_sep.rds"))

# ggplot(res_DML_orf_sep) +
#   geom_line(aes(x=x, y=pred_te)) +
#   geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4) +
#   facet_grid(~variable, scale = "free")

3.4 Results

res_DR_orf_sep <- 
  here("Shared/Results/resRegression/res_DR_orf_sep.rds") %>%
  readRDS()

res_DML_orf_sep <-
  here("Shared/Results/resRegression/res_DML_orf_sep.rds") %>%
  readRDS()

res_sep <- 
  bind_rows(res_DR_orf_sep, res_DML_orf_sep, .id = "type") %>%
  .[,type:=ifelse(type==1, "DR-ORF", "DML-ORF")]

ggplot(res_sep) +
  geom_line(aes(x=x, y=pred_te)) +
  geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4) +
  facet_grid(type~variable, scale = "free_x")+
  labs(title = "Estimate the Impact of pet_in, gdd_in, and pr_in, separately") +
  ylab("Treatment Effect (inches)")

4 CausalForest-DML Analysis

4.1 Estimate the Impact of pet_in, gdd_in, and pr_in, separately

res_cf_DML_sep <- readRDS(here("Shared/Results/resRegression/res_cf_DML_sep.rds"))

ggplot(res_cf_DML_sep) +
  geom_line(aes(x=x, y=pred_te)) +
  geom_ribbon(aes(x=x, ymin=te_lower, ymax=te_upper), alpha=0.4) +
  facet_grid(~variable, scale = "free")